I. Preparing data

# Read the raw listings, keep only the analysis columns, and tidy them up
Fullerton <- read.csv("FullertonHousing.csv") %>%
  select(
    PROPERTY_TYPE, ZIP, PRICE, DAYS_ON_MARKET,
    YEAR_BUILT, LOT_SIZE, SQUARE_FEET
  ) %>%
  mutate(
    # fall back to the living area when the lot size is missing
    LOT_SIZE = ifelse(is.na(LOT_SIZE), SQUARE_FEET, LOT_SIZE),
    # treat the zip code as a categorical variable
    ZIP = as.factor(ZIP)
  )

# Check sample size: cross-tabulate listings by property type (rows) and
# zip code (columns) to see how many observations each combination has
table(Fullerton[, c("PROPERTY_TYPE", "ZIP")]) %>%
  kable(caption = "Sample sizes per zip code per property tpye") %>%
  # spell out FALSE: `F` is an ordinary variable that can be reassigned
  kable_styling(bootstrap_options = "striped", full_width = FALSE)
Table 1: Sample sizes per zip code per property tpye
92831 92832 92833 92835
Condo/Co-op 10 13 3 10
Single Family Residential 48 26 16 56
Townhouse 7 2 0 4

II. Clustering per zip code

ZIP 92835

EDA

# Zip code analysed in this section
zip_code <- 92835

# Listings in this zip code; ZIP is constant within the subset, so it is
# dropped together with any factor levels that are no longer used
subset <- Fullerton %>%
  filter(ZIP == zip_code) %>%
  select(-ZIP) %>%
  droplevels()

# Feature matrix: every column except the first one (PROPERTY_TYPE)
X <- subset[, -1] %>% as.matrix()
# seq_len() stays valid when the filter returns no rows (1:0 would be c(1, 0))
rownames(X) <- paste0("id_", seq_len(nrow(X)))

# Histogram of every feature, filled by property type
subset %>%
  reshape2::melt(id.vars = "PROPERTY_TYPE") %>%
  ggplot(aes(x = value, fill = PROPERTY_TYPE)) +
  # bins = 30 is the ggplot2 default; stating it silences the stat_bin() message
  geom_histogram(bins = 30) +
  scale_fill_jco() +
  facet_wrap(~variable, scales = "free") +
  theme_minimal() +
  labs(x = "")

# Pairwise scatter plots of the features, marked by property type
clPairs(X, classification = subset$PROPERTY_TYPE)

Hierarchical clustering

# Centre and scale every feature so that no single variable dominates
# the distance computation
X_scaled <- scale(X)

# Pairwise Euclidean distances between the standardised listings
d <- dist(X_scaled, method = "euclidean")

# Agglomerative clustering with complete linkage
# (other hclust() linkages include "average", "single", "ward.D", "ward.D2")
hc <- hclust(d, method = "complete")

Optimal clusters

# Elbow method for the optimal number of clusters.
# Evaluate the same setup that produced `hc`: the standardised data and
# complete linkage. hcut() defaults to hc_method = "ward.D2" and does not
# standardise, so running it on the raw X would assess a different,
# PRICE-dominated clustering than the tree that is cut below.
fviz_nbclust(
  X_scaled,
  FUNcluster = hcut,
  method = "wss",
  hc_method = "complete"
)

# # Other methods
# # a. average silhouette method
# fviz_nbclust(X_scaled, FUN = hcut, method = "silhouette", hc_method = "complete")
# # b. gap statistic method
# gap_stat <- cluster::clusGap(X_scaled, FUN = hcut, nstart = 25, K.max = 10, B = 50, hc_method = "complete")
# fviz_gap_stat(gap_stat)

Cut-off tree

# Number of clusters to extract from the tree
cut_off <- 4

# Cluster label of each listing when the tree is cut into `cut_off` groups
myCluster <- cutree(hc, k = cut_off)

# Horizontal dendrogram with the `cut_off` clusters coloured by the "jco" palette
fviz_dend(
  hc,
  k = cut_off,
  horiz = TRUE,
  cex = 0.5,
  k_colors = "jco",
  color_labels_by_k = FALSE,
  main = "",
  ggtheme = theme_minimal()
)

Model-based clustering

Find optimal components

# BIC for Gaussian mixture models over the candidate covariance structures
# and numbers of components; the features are passed in unscaled
BIC <- mclustBIC(data = X)

# BIC curves per model type
plot(BIC)

# Best-scoring model/component combinations
summary(BIC)
## Best BIC values:
##              VEE,3        VEE,5        EEE,5
## BIC      -5617.057 -5618.739033 -5623.865588
## BIC diff     0.000    -1.682532    -6.809087
# There are other criteria, such as ICL (integrated classification likelihood)
# ICL <- mclustICL(X)
# plot(ICL)
# summary(ICL)

Fit with optimal components

# Fit the mixture model chosen by the BIC search, reusing its results via `x`
model <- Mclust(data = X, x = BIC)

# Report the fit together with the estimated parameters
# (mixing probabilities, means and covariance matrices)
summary(model, parameters = TRUE)
## ---------------------------------------------------- 
## Gaussian finite mixture model fitted by EM algorithm 
## ---------------------------------------------------- 
## 
## Mclust VEE (ellipsoidal, equal shape and orientation) model with 3 components: 
## 
##  log-likelihood  n df       BIC       ICL
##       -2736.304 70 34 -5617.057 -5622.855
## 
## Clustering table:
##  1  2  3 
## 21 42  7 
## 
## Mixing probabilities:
##          1          2          3 
## 0.28231665 0.61814819 0.09953516 
## 
## Means:
##                        [,1]         [,2]         [,3]
## PRICE          667887.99878 968095.62657 3.900169e+05
## DAYS_ON_MARKET     51.53225     43.44039 6.448473e+00
## YEAR_BUILT       1965.52891   1973.82742 1.976135e+03
## LOT_SIZE         6927.12401  11176.95928 8.381570e+02
## SQUARE_FEET      1550.83838   2654.35600 1.022338e+03
## 
## Variances:
## [,,1]
##                        PRICE DAYS_ON_MARKET    YEAR_BUILT      LOT_SIZE
## PRICE          19657630003.0  193609.292560 -1.778730e+05 216367710.345
## DAYS_ON_MARKET      193609.3     162.270992 -1.612462e+00      2438.303
## YEAR_BUILT         -177873.0      -1.612462  5.845489e+01    -13685.932
## LOT_SIZE         216367710.3    2438.302617 -1.368593e+04   8718113.292
## SQUARE_FEET       52212490.8     894.193568 -1.751302e+02    303765.679
##                  SQUARE_FEET
## PRICE          52212490.8166
## DAYS_ON_MARKET      894.1936
## YEAR_BUILT         -175.1302
## LOT_SIZE         303765.6786
## SQUARE_FEET      181761.2243
## [,,2]
##                        PRICE DAYS_ON_MARKET    YEAR_BUILT      LOT_SIZE
## PRICE           1.026058e+11   1.010571e+06 -9.284334e+05 1129361761.15
## DAYS_ON_MARKET  1.010571e+06   8.469963e+02 -8.416474e+00      12727.06
## YEAR_BUILT     -9.284334e+05  -8.416474e+00  3.051135e+02     -71435.65
## LOT_SIZE        1.129362e+09   1.272706e+04 -7.143565e+04   45505421.15
## SQUARE_FEET     2.725305e+08   4.667369e+03 -9.141168e+02    1585547.78
##                  SQUARE_FEET
## PRICE           2.725305e+08
## DAYS_ON_MARKET  4.667369e+03
## YEAR_BUILT     -9.141168e+02
## LOT_SIZE        1.585548e+06
## SQUARE_FEET     9.487283e+05
## [,,3]
##                        PRICE DAYS_ON_MARKET    YEAR_BUILT     LOT_SIZE
## PRICE          2753589304.48  27120.2824124 -2.491599e+04 30308221.945
## DAYS_ON_MARKET      27120.28     22.7304954 -2.258695e-01      341.551
## YEAR_BUILT         -24915.99     -0.2258695  8.188207e+00    -1917.089
## LOT_SIZE         30308221.94    341.5510418 -1.917089e+03  1221210.467
## SQUARE_FEET       7313788.91    125.2562921 -2.453178e+01    42550.700
##                  SQUARE_FEET
## PRICE          7313788.90795
## DAYS_ON_MARKET     125.25629
## YEAR_BUILT         -24.53178
## LOT_SIZE         42550.70035
## SQUARE_FEET      25460.63605

Plot the results

# Show the mixture component each listing was assigned to
plot(x = model, what = "classification")

ZIP 92831

EDA

# Zip code analysed in this section
zip_code <- 92831

# Listings in this zip code; ZIP is constant within the subset, so it is
# dropped together with any factor levels that are no longer used
subset <- Fullerton %>%
  filter(ZIP == zip_code) %>%
  select(-ZIP) %>%
  droplevels()

# Feature matrix: every column except the first one (PROPERTY_TYPE)
X <- subset[, -1] %>% as.matrix()
# seq_len() stays valid when the filter returns no rows (1:0 would be c(1, 0))
rownames(X) <- paste0("id_", seq_len(nrow(X)))

# Histogram of every feature, filled by property type
subset %>%
  reshape2::melt(id.vars = "PROPERTY_TYPE") %>%
  ggplot(aes(x = value, fill = PROPERTY_TYPE)) +
  # bins = 30 is the ggplot2 default; stating it silences the stat_bin() message
  geom_histogram(bins = 30) +
  scale_fill_jco() +
  facet_wrap(~variable, scales = "free") +
  theme_minimal() +
  labs(x = "")

# Pairwise scatter plots of the features, marked by property type
clPairs(X, classification = subset$PROPERTY_TYPE)

Hierarchical clustering

# Centre and scale every feature so that no single variable dominates
# the distance computation
X_scaled <- scale(X)

# Pairwise Euclidean distances between the standardised listings
d <- dist(X_scaled, method = "euclidean")

# Agglomerative clustering with complete linkage
# (other hclust() linkages include "average", "single", "ward.D", "ward.D2")
hc <- hclust(d, method = "complete")

Optimal clusters

# Elbow method for the optimal number of clusters.
# Evaluate the same setup that produced `hc`: the standardised data and
# complete linkage. hcut() defaults to hc_method = "ward.D2" and does not
# standardise, so running it on the raw X would assess a different,
# PRICE-dominated clustering than the tree that is cut below.
fviz_nbclust(
  X_scaled,
  FUNcluster = hcut,
  method = "wss",
  hc_method = "complete"
)

Cut-off tree

# Number of clusters to extract from the tree
cut_off <- 4

# Cluster label of each listing when the tree is cut into `cut_off` groups
myCluster <- cutree(hc, k = cut_off)

# Horizontal dendrogram with the `cut_off` clusters coloured by the "jco" palette
fviz_dend(
  hc,
  k = cut_off,
  horiz = TRUE,
  cex = 0.5,
  k_colors = "jco",
  color_labels_by_k = FALSE,
  main = "",
  ggtheme = theme_minimal()
)

Model-based clustering

Find optimal components

# BIC for Gaussian mixture models over the candidate covariance structures
# and numbers of components; the features are passed in unscaled
BIC <- mclustBIC(data = X)

# BIC curves per model type
plot(BIC)

# Best-scoring model/component combinations
summary(BIC)
## Best BIC values:
##              VEE,6        VEE,4       VEE,5
## BIC      -5146.957 -5153.126429 -5158.46020
## BIC diff     0.000    -6.169585   -11.50335

Fit with optimal components

# Fit the mixture model chosen by the BIC search, reusing its results via `x`
model <- Mclust(data = X, x = BIC)

# Report the fit together with the estimated parameters
# (mixing probabilities, means and covariance matrices)
summary(model, parameters = TRUE)
## ---------------------------------------------------- 
## Gaussian finite mixture model fitted by EM algorithm 
## ---------------------------------------------------- 
## 
## Mclust VEE (ellipsoidal, equal shape and orientation) model with 6 components: 
## 
##  log-likelihood  n df       BIC       ICL
##       -2458.683 65 55 -5146.957 -5149.174
## 
## Clustering table:
##  1  2  3  4  5  6 
##  9 17 20  6  4  9 
## 
## Mixing probabilities:
##          1          2          3          4          5          6 
## 0.13841098 0.24952692 0.32122383 0.09080995 0.06157002 0.13845830 
## 
## Means:
##                        [,1]         [,2]         [,3]         [,4]         [,5]
## PRICE          481361.71796 653596.44670 848196.20168 662465.01552 1.375982e+06
## DAYS_ON_MARKET     68.19777     61.27828     44.24036     16.16969 3.251576e+01
## YEAR_BUILT       1973.77774   1957.48136   1955.65164   1955.87847 1.969746e+03
## LOT_SIZE         1250.11315   8091.01965  11785.36048   7467.40520 4.299286e+04
## SQUARE_FEET      1415.78910   1440.56997   2214.82928   1432.38862 3.000589e+03
##                        [,6]
## PRICE          409229.50300
## DAYS_ON_MARKET     25.36325
## YEAR_BUILT       1972.66828
## LOT_SIZE          949.99183
## SQUARE_FEET      1099.30977
## 
## Variances:
## [,,1]
##                       PRICE DAYS_ON_MARKET   YEAR_BUILT      LOT_SIZE
## PRICE          7531919254.5  -654708.29979 334073.69674 103477482.100
## DAYS_ON_MARKET    -654708.3      205.38325    -41.95925     -4967.801
## YEAR_BUILT         334073.7      -41.95925     35.02141      4018.682
## LOT_SIZE        103477482.1    -4967.80132   4018.68190   3477566.051
## SQUARE_FEET      21947429.7    -1900.59588    984.05288    373156.950
##                  SQUARE_FEET
## PRICE          21947429.7422
## DAYS_ON_MARKET    -1900.5959
## YEAR_BUILT          984.0529
## LOT_SIZE         373156.9499
## SQUARE_FEET       89588.7694
## [,,2]
##                       PRICE DAYS_ON_MARKET   YEAR_BUILT     LOT_SIZE
## PRICE          5975207116.2  -519391.87871 265026.67682 82090549.104
## DAYS_ON_MARKET    -519391.9      162.93423    -33.28703    -3941.046
## YEAR_BUILT         265026.7      -33.28703     27.78312     3188.093
## LOT_SIZE         82090549.1    -3941.04620   3188.09269  2758815.743
## SQUARE_FEET      17411291.1    -1507.77692    780.66686   296032.125
##                  SQUARE_FEET
## PRICE          17411291.0596
## DAYS_ON_MARKET    -1507.7769
## YEAR_BUILT          780.6669
## LOT_SIZE         296032.1250
## SQUARE_FEET       71072.3834
## [,,3]
##                      PRICE DAYS_ON_MARKET   YEAR_BUILT     LOT_SIZE
## PRICE          54071055490  -4700099.3520 2398288.7740 742856699.28
## DAYS_ON_MARKET    -4700099      1474.4302    -301.2221    -35663.45
## YEAR_BUILT         2398289      -301.2221     251.4160     28849.80
## LOT_SIZE         742856699    -35663.4546   28849.8011  24965172.96
## SQUARE_FEET      157558871    -13644.2282    7064.4381   2678864.37
##                  SQUARE_FEET
## PRICE          157558870.636
## DAYS_ON_MARKET    -13644.228
## YEAR_BUILT          7064.438
## LOT_SIZE         2678864.372
## SQUARE_FEET       643150.724
## [,,4]
##                       PRICE DAYS_ON_MARKET   YEAR_BUILT     LOT_SIZE
## PRICE          3220269499.0  -279920.30946 142833.09469 44241762.051
## DAYS_ON_MARKET    -279920.3       87.81154    -17.93966    -2123.982
## YEAR_BUILT         142833.1      -17.93966     14.97339     1718.186
## LOT_SIZE         44241762.1    -2123.98175   1718.18607  1486832.175
## SQUARE_FEET       9383616.1     -812.59912    420.73147   159543.126
##                 SQUARE_FEET
## PRICE          9383616.0736
## DAYS_ON_MARKET    -812.5991
## YEAR_BUILT         420.7315
## LOT_SIZE        159543.1262
## SQUARE_FEET      38303.6477
## [,,5]
##                       PRICE DAYS_ON_MARKET   YEAR_BUILT      LOT_SIZE
## PRICE          101520106921   -8824584.326 4502862.5862 1394736811.67
## DAYS_ON_MARKET     -8824584       2768.289    -565.5540     -66959.26
## YEAR_BUILT          4502863       -565.554     472.0414      54166.41
## LOT_SIZE         1394736812     -66959.257   54166.4087   46872897.26
## SQUARE_FEET       295821734     -25617.468   13263.7048    5029652.09
##                 SQUARE_FEET
## PRICE          295821733.98
## DAYS_ON_MARKET    -25617.47
## YEAR_BUILT         13263.70
## LOT_SIZE         5029652.09
## SQUARE_FEET      1207535.71
## [,,6]
##                       PRICE DAYS_ON_MARKET   YEAR_BUILT     LOT_SIZE
## PRICE          5415200499.2  -470713.58468 240187.92399 74396882.626
## DAYS_ON_MARKET    -470713.6      147.66376    -30.16731    -3571.685
## YEAR_BUILT         240187.9      -30.16731     25.17924     2889.299
## LOT_SIZE         74396882.6    -3571.68462   2889.29919  2500254.820
## SQUARE_FEET      15779475.1    -1366.46549    707.50143   268287.489
##                  SQUARE_FEET
## PRICE          15779475.1219
## DAYS_ON_MARKET    -1366.4655
## YEAR_BUILT          707.5014
## LOT_SIZE         268287.4886
## SQUARE_FEET       64411.3582

Plot the results

# Show the mixture component each listing was assigned to
plot(x = model, what = "classification")

III. Clustering per house type

Single Family Residential

EDA

# Property type analysed in this section
property_type <- "Single Family Residential"

# Listings of this type; PROPERTY_TYPE is constant within the subset, so it
# is dropped together with any factor levels that are no longer used
subset <- Fullerton %>%
  filter(PROPERTY_TYPE == property_type) %>%
  select(-PROPERTY_TYPE) %>%
  droplevels()

# Feature matrix: every column except the first remaining one (ZIP)
X <- subset[, -1] %>% as.matrix()
# seq_len() stays valid when the filter returns no rows (1:0 would be c(1, 0))
rownames(X) <- paste0("id_", seq_len(nrow(X)))

# Pairwise scatter plots of the features, marked by zip code
clPairs(X, classification = subset$ZIP)

Hierarchical clustering

# Centre and scale every feature so that no single variable dominates
# the distance computation
X_scaled <- scale(X)

# Pairwise Euclidean distances between the standardised listings
d <- dist(X_scaled, method = "euclidean")

# Agglomerative clustering with complete linkage
# (other hclust() linkages include "average", "single", "ward.D", "ward.D2")
hc <- hclust(d, method = "complete")

Optimal clusters

# Elbow method for the optimal number of clusters.
# Evaluate the same setup that produced `hc`: the standardised data and
# complete linkage. hcut() defaults to hc_method = "ward.D2" and does not
# standardise, so running it on the raw X would assess a different,
# PRICE-dominated clustering than the tree that is cut below.
fviz_nbclust(
  X_scaled,
  FUNcluster = hcut,
  method = "wss",
  hc_method = "complete"
)

Cut-off tree

# Number of clusters to extract from the tree
cut_off <- 4

# Cluster label of each listing when the tree is cut into `cut_off` groups
myCluster <- cutree(hc, k = cut_off)

# Horizontal dendrogram with the `cut_off` clusters coloured by the "jco" palette
fviz_dend(
  hc,
  k = cut_off,
  horiz = TRUE,
  cex = 0.5,
  k_colors = "jco",
  color_labels_by_k = FALSE,
  main = "",
  ggtheme = theme_minimal()
)

Model-based clustering

Find optimal components

# BIC for Gaussian mixture models over the candidate covariance structures
# and numbers of components; the features are passed in unscaled
BIC <- mclustBIC(data = X)

# BIC curves per model type
plot(BIC)

# Best-scoring model/component combinations
summary(BIC)
## Best BIC values:
##              VVE,3        VVE,4        VVE,2
## BIC      -11637.99 -11655.56603 -11673.11340
## BIC diff      0.00    -17.57758    -35.12495

Fit with optimal components

# Fit the mixture model chosen by the BIC search, reusing its results via `x`
model <- Mclust(data = X, x = BIC)

# Report the fit together with the estimated parameters
# (mixing probabilities, means and covariance matrices)
summary(model, parameters = TRUE)
## ---------------------------------------------------- 
## Gaussian finite mixture model fitted by EM algorithm 
## ---------------------------------------------------- 
## 
## Mclust VVE (ellipsoidal, equal orientation) model with 3 components: 
## 
##  log-likelihood   n df       BIC      ICL
##       -5714.338 146 42 -11637.99 -11666.2
## 
## Clustering table:
##  1  2  3 
## 40 55 51 
## 
## Mixing probabilities:
##         1         2         3 
## 0.2591771 0.3791035 0.3617193 
## 
## Means:
##                        [,1]         [,2]         [,3]
## PRICE          622403.45582 701854.96233 1.076618e+06
## DAYS_ON_MARKET     50.38175     43.71939 4.458788e+01
## YEAR_BUILT       1956.09933   1956.85446 1.969576e+03
## LOT_SIZE         7826.73598   7193.81115 1.584944e+04
## SQUARE_FEET      1418.77763   1754.17407 2.858298e+03
## 
## Variances:
## [,,1]
##                        PRICE DAYS_ON_MARKET   YEAR_BUILT      LOT_SIZE
## PRICE          6203302271.97  -1.646795e+05 96871.649785 104558399.623
## DAYS_ON_MARKET    -164679.53   6.328525e+02    -6.158904     -2830.019
## YEAR_BUILT          96871.65  -6.158904e+00     8.355735      1175.552
## LOT_SIZE        104558399.62  -2.830019e+03  1175.551913   2320537.424
## SQUARE_FEET      13984854.88  -1.961888e+02   281.758561    229008.286
##                  SQUARE_FEET
## PRICE          13984854.8790
## DAYS_ON_MARKET     -196.1888
## YEAR_BUILT          281.7586
## LOT_SIZE         229008.2855
## SQUARE_FEET       65471.8151
## [,,2]
##                        PRICE DAYS_ON_MARKET    YEAR_BUILT      LOT_SIZE
## PRICE          15023922025.9  -3.988369e+05 234653.972562  2.531856e+08
## DAYS_ON_MARKET     -398836.9   5.461298e+02     -6.057833 -7.132905e+03
## YEAR_BUILT          234654.0  -6.057833e+00    355.189283  5.650821e+02
## LOT_SIZE         253185613.5  -7.132905e+03    565.082097  8.400783e+06
## SQUARE_FEET       33870770.7  -3.099189e+02    761.746460  5.193118e+05
##                  SQUARE_FEET
## PRICE          33870770.7400
## DAYS_ON_MARKET     -309.9189
## YEAR_BUILT          761.7465
## LOT_SIZE         519311.7623
## SQUARE_FEET      188844.1271
## [,,3]
##                        PRICE DAYS_ON_MARKET   YEAR_BUILT     LOT_SIZE
## PRICE          56883073078.9  -1.509942e+06 889398.89910 957434006.62
## DAYS_ON_MARKET    -1509942.3   6.887649e+02    -15.82653    -34030.10
## YEAR_BUILT          889398.9  -1.582653e+01    458.51636    -54879.96
## LOT_SIZE         957434006.6  -3.403010e+04 -54879.96388 101251369.00
## SQUARE_FEET      128255752.2  -1.935916e+03   3339.61103   1072249.31
##                  SQUARE_FEET
## PRICE          128255752.192
## DAYS_ON_MARKET     -1935.916
## YEAR_BUILT          3339.611
## LOT_SIZE         1072249.312
## SQUARE_FEET       562594.734

Plot the results

# Show the mixture component each listing was assigned to
plot(x = model, what = "classification")